# Global knitr chunk options for the rendered report: show code, but hide
# package startup messages and warnings.
knitr::opts_chunk$set(
  echo    = TRUE,   
  message = FALSE, 
  warning = FALSE 
)

MetaPhlAn Data Preprocessing

We started by importing the metaphlan output, the species-level abundance matrix, and the sample metadata. The metaphlan table was transposed so that rows represent samples and columns represent taxa. We then removed a small set of unwanted samples (week 1, 12, 14, and 16 for mice B3, C3, and E2), and reformatted the remaining sample IDs to match the metadata convention.

In the metadata, we dropped backup samples with IDs containing “B3B”, “C3B”, or “E2B”, and standardized labels so that “B3A”, “C3A”, and “E2A” were replaced by “B3”, “C3”, and “E2” in both the sampleID and Mouse_ID fields. Using these harmonized IDs, we kept only samples present in both the microbiome table and the metadata and constructed a combined dataset containing the Diet and Supplement variables along with the microbial abundance profiles.

Next, we restricted the feature set to species-level taxa by keeping only columns whose names contained “|s_” and did not contain “|t_”. All microbial abundance columns were converted to numeric values. To reduce sparsity, we counted the number of zeros for each species across samples and retained only those species with zeros in fewer than 10% of samples. We then added a small pseudo-count, defined as one tenth of the smallest non-zero value in the matrix, to all zero entries.

Finally, we converted the abundance matrix to relative abundances by normalizing each row so that it sums to one, i.e., each entry represents the proportion of that species within a sample.

# Load inputs:
#  raw  - merged MetaPhlAn profile (taxa x samples). Read with read.table()
#         defaults (header = FALSE), so the original header row becomes data
#         row 1; the transpose step below relies on this.
#  data - precomputed proportion matrix. NOTE(review): `data` is never used
#         again in this file - confirm whether this read can be dropped.
#  meta - tab-separated sample metadata with a header row; quote/comment
#         disabled so IDs and free-text fields are read literally.
raw <- read.table("data/merged_metaphlan.txt")
data <- read.table("data/new_mat_prop.txt")
meta <- read.table(
  "data/DMOF6_merged_metadata_ag.txt",
  header = TRUE,
  sep = "\t",
  fill = TRUE,
  quote = "",
  comment.char = "")
#flip row and column
# Transpose so rows = samples and columns = taxa. Because `raw` was read with
# header = FALSE, its first row holds the original column headers; after the
# transpose the column literally named "clade_name" therefore contains the
# sample IDs, not clade names.
raw_flipped <- as.data.frame(t(raw[,-1])) 
colnames(raw_flipped) <- raw[[1]] 

# Promote the sample-ID column (labelled "clade_name"; see note above) to
# rownames and drop it.
rownames(raw_flipped) <- raw_flipped$clade_name
raw_flipped$clade_name <- NULL

# Remove the unwanted samples: weeks 1, 12, 14, 16 for mice B3, C3 and E2.
rows_to_remove <- c(
  "wk12_B3", "wk14_B3", "wk16_B3", "wk1_B3",
  "wk12_C3", "wk14_C3", "wk16_C3", "wk1_C3",
  "wk12_E2", "wk14_E2", "wk16_E2", "wk1_E2"
)
raw_flipped <- raw_flipped[ !(rownames(raw_flipped) %in% rows_to_remove), ]

#modify row names
# Reformat "wk<week>_<mouse>" IDs to the metadata convention
# "DMOF6_<mouse>_wk<week>".
filtered <- raw_flipped
rownames(filtered) <- sub("^wk(\\d+)_([A-Za-z0-9]+)$", "DMOF6_\\2_wk\\1", rownames(filtered))
library(dplyr)
library(stringr)
library(tibble)

# Harmonize metadata sample IDs:
#  - drop the backup samples whose IDs contain B3B / C3B / E2B,
#  - relabel B3A/C3A/E2A as B3/C3/E2 in both sampleID and Mouse_ID.
id_fixes <- c("B3A" = "B3", "C3A" = "C3", "E2A" = "E2")
meta_clean <- meta %>%
  filter(!str_detect(sampleID, "B3B|E2B|C3B")) %>%   # removes the backup rows
  mutate(
    sampleID = str_replace_all(sampleID, id_fixes),
    Mouse_ID = str_replace_all(Mouse_ID, id_fixes)
  )

# Samples shared by the abundance table and the metadata (order taken from
# the abundance table), plus - for inspection - IDs missing from the metadata.
abund_ids   <- rownames(filtered)
common_ids  <- intersect(abund_ids, meta_clean$sampleID)
missing_ids <- setdiff(abund_ids, meta_clean$sampleID)
#length(common_ids)
#missing_ids

filtered_sub <- filtered[common_ids, , drop = FALSE]

# Metadata restricted to the common samples, in the same order, with
# sampleID promoted to rownames.
meta_sub <- meta_clean %>%
  filter(sampleID %in% common_ids) %>%
  arrange(match(sampleID, common_ids)) %>%
  column_to_rownames("sampleID")

combined <- cbind(meta_sub, filtered_sub)   # metadata + all taxa columns

# Metadata aligned once to common_ids order, then used to pull the two
# per-sample covariates (avoids repeating the same pipeline twice).
meta_aligned <- meta_clean %>%
  filter(sampleID %in% common_ids) %>%
  arrange(match(sampleID, common_ids))

diet_col <- pull(meta_aligned, Diet)
supp_col <- pull(meta_aligned, Supplement)

# Diet/Supplement + abundance profiles, samples as rows.
combined_diet <- cbind(
  Diet        = diet_col,
  Supplement  = supp_col,
  filtered_sub
)
 #combined data with Diet, Supplement
#filter out column according to names
cols_keep <- colnames(combined_diet) %in% c("Diet", "Supplement") | (
               grepl("\\|s_", colnames(combined_diet)) &  # must contain “|s_”, only leave species
               !grepl("t_",  colnames(combined_diet)) # not contain “|t_”, remove all of the taxa 
             ) 

combined_diet <- combined_diet[ , cols_keep]
#convert to numeric
combined_diet <- combined_diet %>%
  mutate(across(-c(Diet, Supplement), ~ as.numeric(as.character(.))))


#count zeros
non_meta_cols <- setdiff(names(combined_diet), c("Diet", "Supplement"))
zero_counts   <- colSums(combined_diet[ , non_meta_cols] == 0, na.rm = TRUE)

n_samples <- nrow(combined_diet)
thr       <- 0.10 * n_samples 

taxa_keep <- non_meta_cols[ zero_counts < thr ]

combined_diet <- combined_diet[ , c("Diet", "Supplement", taxa_keep) ]
mat <- as.matrix(combined_diet[ , -c(1,2)]) 

pseudo <- min(mat[mat > 0]) / 10        
mat[mat == 0] <- pseudo  

#pseudo

# Relative abundances: divide each row by its sum so entries are
# within-sample proportions.
mat_prop <- sweep(mat, 1, rowSums(mat), "/")

# NOTE(review): cbind() of character vectors with a numeric matrix yields a
# CHARACTER matrix, so every value in combined_diet (and in the CSV below)
# is stored as text; downstream code re-coerces the abundances to numeric.
combined_diet <- cbind(
  Diet        = combined_diet$Diet,
  Supplement  = combined_diet$Supplement,
  mat_prop
)

# Persist the processed table for the later analysis sections.
write.csv(
  combined_diet,
  file = "data/combined_diet.csv"
)

Alpha/ Beta Diversity

We first quantified within-sample (alpha) diversity using the Shannon index. For each sample, we computed Shannon diversity from the species-level relative abundance matrix and compared its distribution across weeks using boxplots with overlaid jittered points. We also stratified the analysis by diet group (Lean vs High-Fat) to visualize how alpha diversity changes over time under different dietary conditions.

To assess between-sample (beta) diversity, we calculated Bray–Curtis dissimilarity on the same abundance matrix. We then performed principal coordinates analysis (PCoA) on the Bray–Curtis distance matrix and plotted the first two axes, with points colored by week and shaped by diet. This provides a low-dimensional view of how microbial community composition varies across time and diet groups.

suppressMessages(library(vegan))
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))

# combined_diet is currently a character matrix (see the cbind above);
# rebuild as a data frame and attach per-sample identifiers.
combined_diet <- as.data.frame(combined_diet)
combined_diet$SampleID <- rownames(combined_diet)

# Week is the trailing "wk<n>" token of the sample ID, ordered chronologically.
wk_token <- sub(".*_(wk\\d+)$", "\\1", combined_diet$SampleID)
combined_diet$Week <- factor(wk_token, levels = c("wk1", "wk12", "wk14", "wk16"))

# Numeric species table = everything except the metadata columns.
abundance_cols <- setdiff(colnames(combined_diet), c("Diet", "Supplement", "SampleID", "Week"))
abundance_data <- combined_diet[, abundance_cols]
abundance_data <- as.data.frame(sapply(abundance_data, as.numeric))

# Per-sample Shannon alpha diversity.
combined_diet$Shannon <- diversity(abundance_data, index = "shannon")


# Shannon diversity by week: boxplots with jittered per-sample points on top.
ggplot(combined_diet, aes(x = Week, y = Shannon)) +
  geom_boxplot(fill = "skyblue", outlier.shape = NA) +
  geom_jitter(width = 0.2, size = 1.5, alpha = 0.5) +
  ggtitle("Shannon Diversity Index across Weeks") +
  xlab("Week") +
  ylab("Shannon Index") +
  theme_minimal()

combined_lean <- combined_diet %>% filter(Diet == "Lean")
combined_hf   <- combined_diet %>% filter(Diet == "HighFat")

# Shannon-by-week boxplot for one diet stratum; factors out the plotting
# code that was previously duplicated for the two diet groups.
plot_shannon_by_week <- function(df, box_fill, plot_title) {
  ggplot(df, aes(x = Week, y = Shannon)) +
    geom_boxplot(fill = box_fill, outlier.shape = NA) +
    geom_jitter(width = 0.2, alpha = 0.4, size = 1.5) +
    labs(title = plot_title, x = "Week", y = "Shannon Index") +
    theme_minimal()
}

p1 <- plot_shannon_by_week(combined_lean, "lightgreen", "Lean Diet")
p2 <- plot_shannon_by_week(combined_hf, "lightyellow", "High-Fat Diet")

p1

p2

suppressMessages(library(vegan))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(tibble))


# Re-coerce abundances to numeric (no-op if already numeric).
abundance_data <- as.data.frame(sapply(abundance_data, as.numeric))


# Calculate Bray-Curtis beta-diversity between samples.
dist_bray <- vegan::vegdist(abundance_data, method = "bray")  # or: vegdist(ab_hell, "bray")


# Classical PCoA (metric MDS) on the distance matrix; keep the first 2 axes.
pcoa_fit <- cmdscale(dist_bray, k = 2, eig = TRUE)
pcoa_df <- tibble(
  SampleID = combined_diet$SampleID,
  PC1 = pcoa_fit$points[, 1],
  PC2 = pcoa_fit$points[, 2]
) %>%
  left_join(combined_diet[, c("SampleID", "Diet", "Supplement", "Week")], by = "SampleID")


# Percent variance explained per axis (positive eigenvalues only).
eig_vals <- pcoa_fit$eig[pcoa_fit$eig > 0]
var_exp <- eig_vals / sum(eig_vals)
pc1_var <- if (length(var_exp) >= 1) round(100 * var_exp[1], 1) else NA
pc2_var <- if (length(var_exp) >= 2) round(100 * var_exp[2], 1) else NA


# Ordination plot: color = week, shape = diet. The `if` with no `else`
# yields NULL when variances are unavailable, which paste0() silently drops.
ggplot(pcoa_df, aes(x = PC1, y = PC2, color = Week, shape = Diet)) +
  geom_point(size = 2, alpha = 0.9) +
  labs(
    title = paste0(
      "PCoA on Bray–Curtis β-diversity",
      if (!is.na(pc1_var) && !is.na(pc2_var))
        paste0(" (PC1 ", pc1_var, "%, PC2 ", pc2_var, "%)")
    ),
    x = "PCoA 1", y = "PCoA 2"
  ) +
  theme_minimal() +
  theme(legend.title = element_text(size = 10),
        legend.text  = element_text(size = 9))

Heatmap and Hierarchical Clustering

To visualize overall community structure, we first constructed a species-level proportion matrix from the processed MetaPhlAn data, with rows corresponding to samples and columns corresponding to taxa. As an initial exploratory step, we generated an unsupervised heatmap in which both samples and taxa were clustered, providing a global view of abundance patterns without any metadata overlays.

We then explored the appropriate number of sample clusters using hierarchical clustering on the taxa proportion matrix with Ward’s \(D^2\) linkage. Using the fviz_nbclust function, we examined three standard diagnostics—the within-cluster sum of squares (elbow), average silhouette width, and the gap statistic—across a range of cluster numbers. These criteria helped guide the choice of a low-dimensional cluster structure for summarizing between-sample differences.

For the final heatmap, we computed Bray–Curtis dissimilarity between samples and applied hierarchical clustering with Ward’s \(D^2\) method to order the columns. Species (rows) were clustered in the usual way to group taxa with similar abundance profiles. We added a top annotation bar to display key experimental variables: diet (Lean vs HighFat), supplement type (Cellulose vs Oligofructose), and sampling week (wk1, wk12, wk14, wk16), each encoded with a consistent color scheme. The resulting heatmap highlights major gradients in species composition while simultaneously showing how these patterns relate to diet, supplement, and time point. The plot was also saved as a high-resolution PNG file for use in reports and presentations.

# Reload the processed proportion table (samples x taxa plus Diet/Supplement).
combined_diet <- read.csv("data/combined_diet.csv",
                          header    = TRUE,
                          row.names = 1,
                          stringsAsFactors = FALSE)

suppressMessages(library(tidyverse))
suppressMessages(library(ComplexHeatmap))
suppressMessages(library(factoextra))
suppressMessages(library(cluster))
suppressMessages(library(cowplot))

# Numeric taxa-only matrix used for clustering and the heatmaps.
taxa_prop <- as.matrix(select(combined_diet, -Diet, -Supplement))

#initial heatmap
# Unsupervised overview: taxa as rows (transposed), samples as columns, both
# clustered with Heatmap() defaults; labels blanked for readability.
heat_mat <- t(taxa_prop)
suppressMessages(
  Heatmap(heat_mat,
          name            = "abundance",
          cluster_rows    = TRUE,
          cluster_columns = TRUE,
          column_labels   = rep(" ", ncol(heat_mat)),
          row_labels      = rep(" ", nrow(heat_mat)),
          row_names_gp    = gpar(fontsize = 1.5),
          column_title    = "Taxa Abundance Heatmap before clustering" )
)

cat("Elbow")
## Elbow
# Elbow (within-cluster sum of squares) diagnostic for choosing k, using
# hierarchical clustering with Ward's D2 linkage.
fviz_nbclust(taxa_prop, hcut, method = "wss", hc_method = "ward.D2")

# Silhouette 

cat("Silhouette")
## Silhouette
# Average silhouette width diagnostic (same clustering method).
fviz_nbclust(taxa_prop, hcut, method = "silhouette", hc_method = "ward.D2")

# Gap Statistic 

cat("Gap Statistic")
## Gap Statistic
# Gap statistic with B = 50 bootstrap references. hc_method is passed
# through clusGap's `...` to hcut() so all three diagnostics use Ward's D2,
# consistent with the text; the original call fell back to hcut()'s default
# linkage.
set.seed(1)
gap_stat <- clusGap(taxa_prop, FUN = hcut, K.max = 10, B = 50, hc_method = "ward.D2")
fviz_gap_stat(gap_stat)

# Bray-Curtis distances between samples, hierarchically clustered with
# Ward's D2, then cut into k = 2 clusters.
bc_dist   <- vegdist(taxa_prop, method = "bray")
clust.res <- hclust(bc_dist, method = "ward.D2")
k         <- 2
my.clusts <- cutree(clust.res, k = k)

# cluster color
# NOTE(review): my.clusts and cluster_col are never referenced again in this
# file (the heatmap annotation uses diet/week/supplement instead) - confirm
# whether a cluster annotation bar was intended.
cluster_col <- setNames(c("orange", "green4"), c("1", "2"))


# Sample IDs in heatmap column order; used to look up per-sample metadata.
samples <- rownames(taxa_prop)

## Diet
# Per-sample diet, aligned to `samples` via rowname lookup.
diet_vec <- factor(
  combined_diet[samples, "Diet"],                 
  levels = c("Lean", "HighFat")
)

# Annotation color key for diet.
diet_col <- c(Lean = "#4f8ad1",   
              HighFat = "#d9534f")

## Week
# Map a sample ID to its week label ("wk1", "wk12", "wk14", "wk16").
# "_wk1" is matched with an end anchor so it cannot swallow wk12/wk14/wk16;
# IDs matching no pattern yield NA.
get_week <- function(x) {
  patterns <- c(wk1 = "_wk1$", wk12 = "wk12", wk14 = "wk14", wk16 = "wk16")
  for (wk_label in names(patterns)) {
    if (grepl(patterns[[wk_label]], x, ignore.case = TRUE)) {
      return(wk_label)
    }
  }
  NA_character_
}

# Week factor per sample (chronological level order), derived via get_week().
week_vec <- factor(
  vapply(samples, get_week, character(1)),
  levels = c("wk1", "wk12", "wk14", "wk16")
)

# Annotation color key for week.
week_col <- c(wk1  = "#0080ff",   
              wk12 = "#ffcf00",   
              wk14 = "#ff66b2",   
              wk16 = "#00a651")   


#Supplement
# Per-sample supplement, aligned to `samples` via rowname lookup.
supp_vec <- factor(
  combined_diet[samples, "Supplement"],
  levels = c("Cellulose", "Oligofructose")
)

# Annotation color key for supplement.
supp_col <- c(Cellulose      = "#8da0cb",
              Oligofructose  = "#fc8d62")


#annotation
# Top annotation bars, drawn in this order: supplement, diet, week. The
# `col` list is keyed by annotation name, so its ordering does not matter.
top_anno <- HeatmapAnnotation(
  supp = supp_vec,
  diet = diet_vec,
  week = week_vec,
  col  = list(
    diet = diet_col,
    week = week_col,
    supp = supp_col
  ),
  annotation_name_side = "left",
  annotation_height    = unit(c(4, 4, 4), "mm")
)

# Final annotated heatmap: rows = taxa (default clustering), columns =
# samples clustered on Bray-Curtis distance with Ward's D2 linkage.
suppressMessages(
ht <- taxa_prop %>%
  t() %>%                           
  Heatmap(
    name                        = "abundance",
    clustering_method_columns   = "ward.D2",
    clustering_distance_columns = function(M) vegdist(M, "bray"),
    column_labels               = rep(" ", nrow(taxa_prop)),   
    row_labels                  = rep(" ", ncol(taxa_prop)),  
    top_annotation              = top_anno,
    column_title    = "Taxa Abundance Heatmap Hierarchical clustering" 
  )
)
draw(ht)

# Save a high-resolution PNG copy.
# NOTE(review): "~/Desktop" is user-specific; consider a project path (e.g.
# "data/") for portability. Also, if draw() errors here the png device is
# left open, since dev.off() is only reached on success.
png("~/Desktop/heatmap.png", width = 2400, height = 1800, res = 300)
draw(ht)
invisible(dev.off())

CLR Transformation

suppressMessages(library(tidyverse))
suppressMessages(library(compositions))   

# Reload the proportion table for the CLR (centered log-ratio) analysis.
combined_diet <- read.csv(
  "data/combined_diet.csv",
  header           = TRUE,
  row.names        = 1,
  stringsAsFactors = FALSE
)

# NOTE(review): diet_col/supp_col reuse the names of the heatmap color
# vectors defined earlier; from here on they hold per-sample metadata.
diet_col <- combined_diet$Diet
supp_col  <- combined_diet$Supplement

# Taxa-only numeric matrix (samples x species).
taxa_mat <- combined_diet %>% 
  select(-Diet, -Supplement) %>%        
  as.matrix()

# CLR transform via compositions::clr(); rows remain samples.
clr_mat <- clr(taxa_mat)   

# check.names = FALSE keeps the taxon column names exactly as read.
combined_clr <- data.frame(
  Diet = diet_col,
  Supplement = supp_col,
  clr_mat,
  check.names = FALSE
)


# NOTE(review): row_sums is computed but never inspected or used afterwards -
# presumably a sanity check (CLR rows should sum to ~0); confirm or remove.
row_sums <- rowSums(clr_mat)

Linear Mixed Model for all samples

#LMM and BH

suppressMessages(library(nlme))

# Split "DMOF6_<mouse>_wk<week>" sample IDs into design variables and fix
# factor level orders (the first level of each factor - wk1, Lean,
# Cellulose - acts as the model baseline).
combined_clr <- combined_clr %>%
  rownames_to_column("SampleID") %>%
  separate(SampleID,
           into   = c("Batch", "MouseID", "Week"),
           sep    = "_",
           remove = FALSE) %>%
  mutate(
    MouseID    = factor(MouseID),
    Week       = factor(Week, levels = c("wk1", "wk12", "wk14", "wk16")),
    Diet       = factor(Diet, levels = c("Lean", "HighFat")),
    Supplement = factor(Supplement, levels = c("Cellulose","Oligofructose"))
  )

#create an empty matrix
# One row per taxon; the three columns are filled by the model loop below.
meta_fields <- c("SampleID", "Batch", "MouseID", "Week", "Diet", "Supplement")
taxa_cols <- setdiff(colnames(combined_clr), meta_fields)

n_taxa <- length(taxa_cols)
results_mat <- matrix(NA, nrow = n_taxa, ncol = 3,
                      dimnames = list(taxa_cols,
                                      c("t_value", "p_value", "p_adj")))

# Fit one linear mixed model per taxon: CLR abundance ~ Diet + Supplement +
# Week (fixed effects), with a random intercept per mouse to account for
# repeated measures. The Diet (HighFat vs Lean) t- and p-values are stored.
for (i in seq_along(taxa_cols)) {
  taxon <- taxa_cols[i]

  df_tmp <- combined_clr %>%
    select(Abundance = all_of(taxon), Diet, Supplement, Week, MouseID)
  
  # method = "ML": maximum likelihood rather than the lme default (REML);
  # na.exclude keeps fitted/residual vectors aligned with the input rows.
  mod <- lme(fixed   = Abundance ~ Diet + Supplement + Week, #control Supplement and Week
             random  = ~1 | MouseID,
             data    = df_tmp,
             na.action = na.exclude,
             method  = "ML")  
  
#get t and p
  # Pull the DietHighFat row from the fixed-effects t-table.
  tt <- summary(mod)$tTable
  results_mat[i, "t_value"] <- tt["DietHighFat", "t-value"]
  results_mat[i, "p_value"] <- tt["DietHighFat", "p-value"]
}

# Benjamini-Hochberg FDR correction across all taxa, then sort by adjusted p.
results_mat[, "p_adj"] <- p.adjust(results_mat[, "p_value"], method = "BH")

lmm_results <- results_mat %>%
  as.data.frame() %>%
  rownames_to_column("Taxon") %>%
  arrange(p_adj)


write.csv(lmm_results,
          "data/lmm_taxa.csv",
          row.names = FALSE)



# Taxa significant after BH correction (FDR < 0.05), most significant first.
sig_tbl <- lmm_results %>%
  filter(p_adj < 0.05) %>%
  select(Taxon, t_value, p_value, p_adj) %>%
  arrange(p_adj)

# Convenience grouping factor combining diet and supplement,
# e.g. "Lean+Cellulose".
combined_clr <- combined_clr %>%
  mutate(Group = interaction(Diet, Supplement, sep = "+", drop = TRUE)) %>%
  relocate(Group, .after = Supplement)

write.csv(combined_clr,
          "data/combined_clr.csv",
          row.names = TRUE)

Lasso Regression

## ## LASSO by Week
## ### wk1
## 
## **Number of selected taxa:** 7

## ### wk12
## 
## **Number of selected taxa:** 8

## ### wk14
## 
## **Number of selected taxa:** 9

## ### wk16
## 
## **Number of selected taxa:** 6

cat("Lasso across all weeks")
## Lasso across all weeks
suppressMessages(library(tidyverse))
suppressMessages(library(glmnet))    
set.seed(1)       


# Binary outcome: 1 = HighFat, 0 = Lean.
y <- ifelse(combined_clr$Diet == "HighFat", 1, 0)          # 0/1 outcome

# Predictor matrix: all CLR taxa (metadata/design columns removed).
X <- combined_clr %>%                                    
  select(-SampleID, -Batch, -MouseID, -Week,
         -Diet, -Supplement, -Group) %>%                 
  as.matrix()

# 10-fold CV to choose the penalty; alpha = 1 is pure lasso. Fold
# assignment consumes the RNG, so set.seed(1) above fixes the result.
cvfit <- cv.glmnet(
  x       = X,
  y       = y,
  family  = "binomial",  
  alpha   = 1,           
  nfolds  = 10,
  standardize = TRUE
)

# lambda.1se: largest penalty within 1 SE of the CV minimum (sparser model).
lambda_opt <- cvfit$lambda.1se 

# Diet ~ CLRtransformed abundance
fit <- glmnet(
  x           = X,
  y           = y,
  family      = "binomial",
  alpha       = 1,
  lambda      = lambda_opt,
  standardize = TRUE
)

# Non-zero coefficients (intercept dropped) are the selected taxa.
coef_vec <- coef(fit)                   
sel_idx  <- which(coef_vec[-1, 1] != 0) 
selected_taxa <- rownames(coef_vec)[-1][sel_idx]

# Positive coefficient -> higher CLR abundance associated with HighFat.
lasso_tbl <- tibble(
  Taxon      = selected_taxa,
  Coefficient = as.numeric(coef_vec[-1, 1][sel_idx]),
  Direction  = ifelse(Coefficient > 0, "HighFat↑", "Lean↑")
) %>% arrange(desc(abs(Coefficient)))

#print(lasso_tbl)

write.csv(lasso_tbl,
          "data/lasso_selected_taxa.csv",
          row.names = FALSE)

#cat("Number of selected taxa:", nrow(lasso_tbl), "\n")

Number of significant taxa: 16

# Full coefficient table for the fitted lasso (intercept removed); kept for
# inspection only.
coef_vec <- coef(fit)
all_coefs <- data.frame(
  Taxon       = rownames(coef_vec),
  Coefficient = as.numeric(coef_vec[, 1])
)
coef_df <- filter(all_coefs, Taxon != "(Intercept)")

#coef_df
library(stringr)
library(ggplot2)

# Shorten taxon labels to the last dot-separated token (the species name).
lasso_tbl <- lasso_tbl %>%
  mutate(
    ShortTaxon = str_extract(Taxon, "[^\\.]+$")  
  )

# Horizontal bar chart of lasso coefficients, colored by direction.
ggplot(lasso_tbl,
       aes(x = reorder(ShortTaxon, Coefficient),
           y = Coefficient,
           fill = Direction)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Lasso-Selected Taxa",
    x = "Taxon",
    y = "Lasso Coefficient"
  ) +
  scale_fill_manual(values = c("HighFat↑" = "tomato", "Lean↑" = "skyblue")) +
  theme_minimal(base_size = 12)

library(ggpubr)

# Lasso-selected taxa (dot-mangled names match combined_diet's read.csv
# column names, since both passed through base name munging).
selected_taxa <- lasso_tbl$Taxon 

# Long format: one row per (sample, taxon) for the selected taxa.
long_df <- combined_diet %>%
  select(Diet, all_of(selected_taxa)) %>%  
  pivot_longer(
    cols = -Diet,
    names_to = "Taxon",
    values_to = "Abundance"
  )

# Violin + jitter per taxon, with Wilcoxon significance stars comparing diets.
ggplot(long_df, aes(x = Diet, y = Abundance, fill = Diet)) +
  geom_violin(trim = FALSE, alpha = 0.4, scale = "width") +
  geom_jitter(width = 0.2, size = 1.2, alpha = 0.4, color = "black") +
  stat_compare_means(method = "wilcox.test", label = "p.signif") + 
  facet_wrap(~ Taxon, scales = "free_y", ncol = 4) +
  labs(
    title = "Abundance of Lasso-Selected Taxa by Diet",
    x = "Diet",
    y = "Abundance"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    strip.text = element_text(size = 9, face = "bold"),
    legend.position = "top"
  )

# ggsave() with no plot argument saves the last plot displayed.
ggsave("lasso_violin_plot.png", width = 16, height = 16, dpi = 300)

#selected_taxa

#selected_taxa

Predictive Modeling

suppressMessages(library(tidyverse))
suppressMessages(library(glmnet))
suppressMessages(library(knitr))
suppressMessages(library(kableExtra))

set.seed(1)

# Per-week predictive modeling setup.
weeks <- c("wk1", "wk12", "wk14", "wk16")

# CLR taxa columns = everything that is not a metadata/design column.
non_taxa <- c("SampleID", "Batch", "MouseID", "Week", "Diet", "Supplement", "Group")
taxa_cols <- setdiff(colnames(combined_clr), non_taxa)

n_repeats <- 50  # repeated random 80/20 train/test splits
results_list <- list()

# Per-week LASSO classification of Diet with repeated 80/20 splits.
# NOTE(review): this loop reassigns the globals X and y; any code after it
# that reads X/y will see the LAST week's subset, not the all-sample matrix.
for (wk in weeks) {
  df_w <- combined_clr %>% dplyr::filter(.data$Week == wk)

  # Per-class sample counts for the summary table.
  n_samples <- nrow(df_w)
  diet_tab  <- table(df_w$Diet)
  lean_n    <- unname(ifelse("Lean"    %in% names(diet_tab), diet_tab["Lean"],    0))
  hf_n      <- unname(ifelse("HighFat" %in% names(diet_tab), diet_tab["HighFat"], 0))


  # Skip weeks with too few samples or only one diet class present.
  if (n_samples < 10 || length(unique(df_w$Diet)) < 2) {
    results_list[[wk]] <- tibble(
      Week = wk, N = n_samples, Lean = as.integer(lean_n), HighFat = as.integer(hf_n),
      Train_Error_Mean = NA_real_, Train_Error_SD = NA_real_,
      Test_Error_Mean  = NA_real_, Test_Error_SD  = NA_real_
    )
    next
  }

  y <- ifelse(df_w$Diet == "HighFat", 1, 0)
  X <- as.matrix(df_w[, taxa_cols, drop = FALSE])


  # Drop zero-variance taxa within this week.
  nzv <- apply(X, 2, sd, na.rm = TRUE) > 0
  X   <- X[, nzv, drop = FALSE]

  if (ncol(X) == 0) {
    results_list[[wk]] <- tibble(
      Week = wk, N = n_samples, Lean = as.integer(lean_n), HighFat = as.integer(hf_n),
      Train_Error_Mean = NA_real_, Train_Error_SD = NA_real_,
      Test_Error_Mean  = NA_real_, Test_Error_SD  = NA_real_
    )
    next
  }

  train_errs <- numeric(n_repeats)
  test_errs  <- numeric(n_repeats)

  # Repeated random splits; set.seed(r) makes split r reproducible.
  for (r in seq_len(n_repeats)) {
    set.seed(r)
    n <- nrow(X)
    tr_idx <- sample(seq_len(n), size = floor(0.8 * n))

    X_tr <- X[tr_idx, , drop = FALSE]
    y_tr <- y[tr_idx]
    X_te <- X[-tr_idx, , drop = FALSE]
    y_te <- y[-tr_idx]

    # Tune lambda by 10-fold CV on the training split only; refit at
    # lambda.1se for a sparser model.
    cvfit <- cv.glmnet(
      x = X_tr, y = y_tr,
      family = "binomial", alpha = 1,
      nfolds = 10, standardize = TRUE
    )
    fit <- glmnet(
      x = X_tr, y = y_tr,
      family = "binomial", alpha = 1,
      lambda = cvfit$lambda.1se, standardize = TRUE
    )

    # Classify at a 0.5 probability threshold.
    pred_tr <- ifelse(predict(fit, X_tr, type = "response") > 0.5, 1, 0)
    pred_te <- ifelse(predict(fit, X_te, type = "response") > 0.5, 1, 0)

    train_errs[r] <- mean(pred_tr != y_tr)
    test_errs[r]  <- mean(pred_te != y_te)
  }

  # Misclassification-rate summary for this week.
  results_list[[wk]] <- tibble(
    Week = wk,
    N = n_samples,
    Lean = as.integer(lean_n),
    HighFat = as.integer(hf_n),
    `Train Error (mean)` = mean(train_errs),
    `Train Error (sd)`   = sd(train_errs),
    `Test Error (mean)`  = mean(test_errs),
    `Test Error (sd)`    = sd(test_errs)
  )
}

# Assemble the per-week results in chronological order, rounded for display.
results_tbl <- bind_rows(results_list) %>%
  mutate(Week = factor(Week, levels = c("wk1","wk12","wk14","wk16"))) %>%
  arrange(Week) %>%
  mutate(across(where(is.numeric), ~ round(., 3)))

# Styled HTML table of errors by week.
knitr::kable(
  results_tbl,
  format  = "html",
  caption = "Predictive modeling by week (LASSO, 50 repeats of 80/20 splits)",
  align   = "lrrrrrrrr"
) %>%
  kableExtra::kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Predictive modeling by week (LASSO, 50 repeats of 80/20 splits)
Week N Lean HighFat Train Error (mean) Train Error (sd) Test Error (mean) Test Error (sd)
wk1 26 14 12 0.000 0.000 0.003 0.024
wk12 29 14 15 0.001 0.006 0.033 0.095
wk14 29 14 15 0.001 0.006 0.093 0.162
wk16 28 14 14 0.032 0.036 0.133 0.178
suppressMessages(library(glmnet))
suppressMessages(library(tidyverse))
suppressMessages(library(knitr))
suppressMessages(library(kableExtra))

set.seed(1)

# Rebuild the all-sample design matrix and outcome. The per-week loop above
# overwrites the globals X and y with its final week's subset, so without
# this step the "all weeks" evaluation silently reruns on the last week only
# (its reported errors exactly matched the wk16 row of the per-week table).
y <- ifelse(combined_clr$Diet == "HighFat", 1, 0)
X <- combined_clr %>%
  select(-SampleID, -Batch, -MouseID, -Week,
         -Diet, -Supplement, -Group) %>%
  as.matrix()

n_repeats <- 50
train_error_vec <- numeric(n_repeats)
test_error_vec  <- numeric(n_repeats)

# All-weeks repeated-split evaluation using the current globals X and y.
# NOTE(review): X/y are whatever was assigned most recently - the per-week
# loop above overwrote the all-sample versions with its final week's subset;
# verify X/y before trusting the summary table below.
for (r in seq_len(n_repeats)) {
  set.seed(r)
 
  # sample() coerces the non-integer size 0.8 * n to integer (truncation).
  n <- nrow(X)
  train_idx <- sample(1:n, size = 0.8 * n)
  X_train <- X[train_idx, ]
  y_train <- y[train_idx]
  X_test  <- X[-train_idx, ]
  y_test  <- y[-train_idx]

  # Tune lambda by 10-fold CV on the training split; refit at lambda.1se.
  cvfit_train <- cv.glmnet(
    x = X_train,
    y = y_train,
    family = "binomial",
    alpha = 1,
    nfolds = 10,
    standardize = TRUE
  )
  lambda_opt <- cvfit_train$lambda.1se

  fit_train <- glmnet(
    x = X_train,
    y = y_train,
    family = "binomial",
    alpha = 1,
    lambda = lambda_opt,
    standardize = TRUE
  )

  yhat_train <- predict(fit_train, X_train, type = "response")
  yhat_test  <- predict(fit_train, X_test,  type = "response")

  # Classify at a 0.5 probability threshold.
  pred_train <- ifelse(yhat_train > 0.5, 1, 0)
  pred_test  <- ifelse(yhat_test  > 0.5, 1, 0)

  train_error <- mean(pred_train != y_train)
  test_error  <- mean(pred_test  != y_test)


  train_error_vec[r] <- train_error
  test_error_vec[r]  <- test_error
}

# Summary across all weeks. N is taken from the data rather than hard-coded
# (the original wrote a literal 112) so the table stays correct if the
# sample set or the X matrix changes.
results_tbl <- tibble(
  N = nrow(X),
  `Train Error (mean)` = mean(train_error_vec),
  `Train Error (sd)`   = sd(train_error_vec),
  `Test Error (mean)`  = mean(test_error_vec),
  `Test Error (sd)`    = sd(test_error_vec),
  # 0.5 assumes balanced classes - presumably Lean/HighFat are ~50/50 here;
  # confirm, or compute max(mean(y), 1 - mean(y)) instead.
  `Null Model Error`   = 0.5
) |>
  mutate(across(where(is.numeric), ~ round(., 3)))

knitr::kable(
  results_tbl,
  format  = "html",
  caption = "Predictive modeling across all weeks (LASSO, 50 repeats of 80/20 splits)",
  align   = "rrrrrr"
) |>
  kableExtra::kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Predictive modeling across all weeks (LASSO, 50 repeats of 80/20 splits)
N Train Error (mean) Train Error (sd) Test Error (mean) Test Error (sd) Null Model Error
112 0.032 0.036 0.133 0.178 0.5

Co-occurrence Plot

suppressMessages(library(igraph))
suppressMessages(library(stringr))
suppressMessages(library(dplyr))

# Guards: these objects must exist from earlier chunks.
stopifnot(exists("combined_clr"), exists("weeks"))
# NOTE(review): lasso_selected_by_week (per-week selected taxa) is not
# defined anywhere in this file - the chunk that builds it appears to be
# missing, so this guard will fail at render time; confirm where it is
# created.
stopifnot(exists("lasso_selected_by_week")) 

# Linearly rescale x into the interval `to` (used for edge widths).
# Degenerate input (constant values, or a non-finite range) maps every
# element to the midpoint of `to`.
rescale01 <- function(x, to = c(0.6, 3.8)) {
  bounds <- range(x, na.rm = TRUE)
  span <- bounds[2] - bounds[1]
  if (!is.finite(bounds[1]) || !is.finite(bounds[2]) || span == 0) {
    return(rep(mean(to), length(x)))
  }
  (x - bounds[1]) / span * (to[2] - to[1]) + to[1]
}

# EBIC
# Extended BIC for a Gaussian lasso path: goodness-of-fit term n*log(rss/n)
# plus the BIC penalty df*log(n) plus the extended penalty 2*gamma*df*log(p).
ebic <- function(rss, df, n, p, gamma = 0.5) {
  fit_term <- n * log(rss / n)
  fit_term + df * log(n) + 2 * gamma * df * log(p)
}
gamma <- 0.5
tau   <- 0  # minimum |edge weight| kept in the plots (0 = keep all edges)

# Per-week co-occurrence network among that week's lasso-selected taxa,
# estimated by nodewise lasso regressions with EBIC lambda selection and
# drawn as a circular graph.
for (wk in weeks) {
  cat("\n### Week:", wk, "\n")

  # Taxa selected for this week; requires lasso_selected_by_week from an
  # earlier chunk (not visible in this file - see guard above).
  sel_names <- lasso_selected_by_week[[wk]]
  if (is.null(sel_names) || length(sel_names) <= 1) {
    cat("No significant taxa in this week; skipped.\n")
    next
  }

  df_w <- combined_clr %>% dplyr::filter(.data$Week == wk)

  sel_names <- intersect(sel_names, colnames(df_w))
  if (length(sel_names) <= 1) {
    cat("Selected taxa not present in this week; skipped.\n")
    next
  }

  # CLR abundances of the selected taxa for this week's samples.
  X <- as.matrix(df_w[, sel_names, drop = FALSE])
  n <- nrow(X); p <- ncol(X)
  colnames(X) <- sel_names


  # Nodewise lasso: regress each taxon on all others; choose lambda on each
  # path by EBIC; store coefficients in row i of B.
  set.seed(1)
  B <- matrix(0, p, p, dimnames = list(colnames(X), colnames(X)))
  for (i in seq_len(p)) {
    y  <- X[, i]
    Xo <- X[, -i, drop = FALSE]
    fit  <- glmnet::glmnet(x = Xo, y = y, alpha = 1, family = "gaussian", standardize = TRUE)
    Yhat <- predict(fit, Xo)
    rss  <- colSums((y - Yhat)^2)
    k    <- which.min(ebic(rss, fit$df, n, p - 1, gamma = gamma))
    co   <- as.numeric(coef(fit, s = fit$lambda[k]))[-1]
    B[i, -i] <- co
  }


  # Symmetrize to an undirected weight matrix; zero the diagonal.
  W <- (B + t(B)) / 2
  diag(W) <- 0

  adj <- (W != 0) * 1
  g   <- igraph::graph_from_adjacency_matrix(adj, mode = "undirected", diag = FALSE)

  # Edge weights looked up from W in the graph's edge order.
  get_w <- function(W, g) {
    el <- igraph::as_edgelist(g, names = FALSE)
    apply(el, 1, function(ix) W[ix[1], ix[2]])
  }
  w <- get_w(W, g)


  # Threshold edges at |weight| >= tau (tau = 0 keeps every non-zero edge).
  keep_e <- which(abs(w) >= tau)
  if (length(keep_e) == 0) {
    cat(sprintf("No edges with |coef| >= %.2f; skipped.\n", tau))
    next
  }
  g_tau <- igraph::subgraph.edges(g, eids = keep_e, delete.vertices = FALSE)
  w_tau <- get_w(W, g_tau)


  # Circular layout; theta = node angle, used to rotate labels outward.
  coords <- igraph::layout_in_circle(g_tau)
  theta  <- atan2(coords[,2], coords[,1])

  # Short display names: species token, else genus token, else last "_" part.
  short_names <- sapply(igraph::V(g_tau)$name, function(x){
    sp <- stringr::str_extract(x, "(?<=\\.s__)[^\\._]+")
    if (!is.na(sp) && nzchar(sp)) return(sp)
    ge <- stringr::str_extract(x, "(?<=\\.g__)[^\\._]+")
    if (!is.na(ge) && nzchar(ge)) return(ge)
    tail(strsplit(x, "_", fixed = TRUE)[[1]], 1)
  })

  # Flip left-side labels so they read outward.
  # NOTE(review): atan2 returns (-180, 180], so angles below -90 (lower-left
  # quadrant) never satisfy this test and may render upside down - verify.
  ang_deg_raw <- theta * 180 / pi
  flip  <- (ang_deg_raw > 90 & ang_deg_raw < 270)
  ang_deg <- ifelse(flip, ang_deg_raw - 180, ang_deg_raw)
  hjust   <- ifelse(flip, 1, 0)

  #plot
  # Edge width scales with |weight|; blue = positive, red = negative.
  e_width <- rescale01(abs(w_tau), to = c(0.6, 4.0))
  e_color <- ifelse(w_tau >= 0, "steelblue", "firebrick")
  e_curve <- 0.15
  node_cols <- grDevices::rainbow(igraph::vcount(g_tau), s = 0.9, v = 0.9)

  # Enlarge the layout and push labels slightly outside the node circle.
  enlarge   <- 1.30
  lab_push  <- 1.08
  label_cex <- 0.55
  layout_scaled <- coords * enlarge
  lab_xy_scaled <- cbind(coords[,1]*enlarge*lab_push, coords[,2]*enlarge*lab_push)
  lim <- 1.02 * enlarge * lab_push

  op <- par(mar = c(3,3,5,3), xpd = NA, xaxs = "i", yaxs = "i")
  plot(
    g_tau,
    layout = layout_scaled,
    vertex.size = 4.2,
    vertex.label = NA,
    vertex.color = node_cols,
    vertex.frame.color = NA,
    edge.width = e_width,
    edge.color = e_color,
    edge.curved = e_curve,
    rescale = FALSE,
    xlim = c(-lim, lim), ylim = c(-lim, lim),
    asp = 1
  )
  title(sprintf("Week %s Taxa Co-occurrence |coef| ≥ %.2f", wk, tau), line = 3)

  # Hand-drawn rotated labels at each node's pushed-out position.
  for (i in seq_len(igraph::vcount(g_tau))) {
    text(lab_xy_scaled[i,1], lab_xy_scaled[i,2], short_names[i],
         srt = ang_deg[i], adj = hjust[i],
         cex = label_cex, col = "black")
  }
  par(op)
}
## 
## ### Week: wk1

## 
## ### Week: wk12

## 
## ### Week: wk14

## 
## ### Week: wk16

suppressMessages(library(igraph))
suppressMessages(library(stringr))


# Linearly rescale x into the interval `to`; redefined here so this chunk is
# self-contained. Constant or non-finite input maps to the midpoint of `to`.
rescale01 <- function(x, to = c(0.6, 3.8)) {
  lo_hi <- range(x, na.rm = TRUE)
  width <- lo_hi[2] - lo_hi[1]
  if (!is.finite(lo_hi[1]) || !is.finite(lo_hi[2]) || width == 0) {
    return(rep(mean(to), length(x)))
  }
  (x - lo_hi[1]) / width * (to[2] - to[1]) + to[1]
}


# Co-occurrence network per Diet+Supplement group.
# NOTE(review): sel_names falls back to the globally selected taxa
# (selected_taxa / lasso_tbl), so every group is drawn over the SAME taxon
# set - only the samples differ. Confirm this is intended.
groups <- unique(combined_clr$Group)

for (gname in groups) {
  cat("\n### Group:", gname, "\n")
  

  df_g <- combined_clr %>% dplyr::filter(Group == gname)
  
  sel_names <- NULL
  if (exists("selected_taxa")) sel_names <- selected_taxa
  if (is.null(sel_names) && exists("lasso_tbl")) sel_names <- unique(lasso_tbl$Taxon)
  
  if (is.null(sel_names) || length(sel_names) <= 1) {
    cat("No significant taxa for this group; skipped.\n")
    next
  }
  

  # CLR abundances of the selected taxa for this group's samples.
  X <- as.matrix(df_g[, sel_names, drop = FALSE])
  colnames(X) <- sel_names
  n <- nrow(X); p <- ncol(X)
  

  # Extended BIC used to pick lambda along each nodewise lasso path.
  ebic <- function(rss, df, n, p, gamma = 0.5) {
    n*log(rss/n) + df*log(n) + 2*gamma*df*log(p)
  }
  gamma <- 0.5
  
  # Nodewise lasso: regress each taxon on the others within this group.
  B <- matrix(0, p, p, dimnames = list(colnames(X), colnames(X)))
  for (i in seq_len(p)) {
    y  <- X[, i]
    Xo <- X[, -i, drop = FALSE]
    fit  <- glmnet(x = Xo, y = y, alpha = 1, family = "gaussian", standardize = TRUE)
    Yhat <- predict(fit, Xo)
    rss  <- colSums((y - Yhat)^2)
    k    <- which.min(ebic(rss, fit$df, n, p - 1, gamma = gamma))
    co   <- as.numeric(coef(fit, s = fit$lambda[k]))[-1]
    B[i, -i] <- co
  }
  
  # Symmetrized undirected weight matrix.
  W <- (B + t(B)) / 2
  diag(W) <- 0
  

  adj <- (W != 0) * 1
  g   <- graph_from_adjacency_matrix(adj, mode = "undirected", diag = FALSE)
  
  # Edge weights looked up from W in the graph's edge order.
  get_w <- function(W, g) {
    el <- as_edgelist(g, names = FALSE)
    apply(el, 1, function(ix) W[ix[1], ix[2]])
  }
  w        <- get_w(W, g)
  e_width  <- rescale01(abs(w), to = c(0.6, 4.0))
  e_color  <- ifelse(w >= 0, "steelblue", "firebrick")
  e_curve  <- 0.15
  node_cols <- grDevices::rainbow(vcount(g), s = 0.9, v = 0.9)
  
  # Circular layout with rotated, outward-facing labels.
  coords <- layout_in_circle(g)
  theta  <- atan2(coords[,2], coords[,1])
  short_names <- sapply(V(g)$name, function(x){
    sp <- str_extract(x, "(?<=\\.s__)[^\\._]+")
    if (!is.na(sp) && nzchar(sp)) return(sp)
    ge <- str_extract(x, "(?<=\\.g__)[^\\._]+")
    if (!is.na(ge) && nzchar(ge)) return(ge)
    tail(strsplit(x, "_", fixed = TRUE)[[1]], 1)
  })
  # NOTE(review): atan2 yields (-180, 180], so angles below -90 (lower-left
  # quadrant) are never flipped and may render upside down - verify.
  flip  <- (theta * 180 / pi > 90 & theta * 180 / pi < 270)
  ang_deg <- ifelse(flip, theta*180/pi - 180, theta*180/pi)
  hjust   <- ifelse(flip, 1, 0)
  

  # tau = 0 keeps every non-zero edge, so g_tau equals g here.
  tau   <- 0
  keep_e <- which(abs(w) >= tau)
  g_tau  <- subgraph.edges(g, eids = keep_e, delete.vertices = FALSE)
  w_tau  <- get_w(W, g_tau)
  
  # Enlarge the layout and push labels slightly outside the node circle.
  enlarge   <- 1.30
  lab_push  <- 1.08
  label_cex <- 0.55
  layout_scaled <- coords * enlarge
  lab_xy_scaled <- cbind(coords[,1]*enlarge*lab_push, coords[,2]*enlarge*lab_push)
  lim <- 1.02 * enlarge * lab_push
  
  op <- par(mar = c(3,3,5,3), xpd = NA, xaxs = "i", yaxs = "i")
  plot(
    g_tau,
    layout = layout_scaled,
    vertex.size = 4.2,
    vertex.label = NA,
    vertex.color = node_cols,
    vertex.frame.color = NA,
    edge.width = rescale01(abs(w_tau), to = c(0.6, 4.0)),
    edge.color = ifelse(w_tau >= 0, "steelblue", "firebrick"),
    edge.curved = e_curve,
    rescale = FALSE,
    xlim = c(-lim, lim), ylim = c(-lim, lim),
    asp = 1
  )
  title(sprintf("Group: %s Taxa Co-occurrence |coef| ≥ %.2f", gname, tau), line = 3)
  
  # Hand-drawn rotated labels at each node's pushed-out position.
  for (i in seq_len(vcount(g_tau))) {
    text(lab_xy_scaled[i,1], lab_xy_scaled[i,2], short_names[i],
         srt = ang_deg[i], adj = hjust[i],
         cex = label_cex, col = "black")
  }
  par(op)
}
## 
## ### Group: Lean+Cellulose

## 
## ### Group: HighFat+Cellulose

## 
## ### Group: Lean+Oligofructose

## 
## ### Group: HighFat+Oligofructose

suppressMessages(library(dplyr))
suppressMessages(library(stringr))

# Resolve the taxa to model for the all-samples network: prefer the
# `selected_taxa` vector, else fall back to the lasso table; at least two
# taxa are required to form edges.
sel_names <- NULL
if (exists("selected_taxa")) {
  sel_names <- selected_taxa
}
if (is.null(sel_names) && exists("lasso_tbl")) {
  sel_names <- unique(lasso_tbl$Taxon)
}
stopifnot("no available taxa" = length(sel_names) > 1)
message("Using ", length(sel_names), " taxa")


# CLR abundance matrix restricted to the selected taxa (all samples).
X <- as.matrix(combined_clr[, sel_names, drop = FALSE])
colnames(X) <- sel_names
n <- nrow(X)
p <- ncol(X)


# Extended BIC (EBIC) used to choose lambda along each nodewise lasso path:
# Gaussian goodness-of-fit term plus BIC and extended-BIC penalties.
ebic <- function(rss, df, n, p, gamma = 0.5) {
  gof <- n * log(rss / n)
  gof + df * log(n) + 2 * gamma * df * log(p)
}
gamma <- 0.5    
set.seed(1)


# Nodewise lasso over all samples (neighborhood-selection style): regress
# each taxon on all others, pick lambda by EBIC, store coefficients in row i.
B <- matrix(0, p, p, dimnames = list(colnames(X), colnames(X)))

for (i in seq_len(p)) {
  y  <- X[, i]
  Xo <- X[, -i, drop = FALSE]


  fit  <- glmnet(x = Xo, y = y, alpha = 1, family = "gaussian", standardize = TRUE)

  # RSS at every lambda along the path, then the EBIC-optimal index k.
  Yhat <- predict(fit, Xo)                      
  rss  <- colSums((y - Yhat)^2)
  k    <- which.min(ebic(rss, fit$df, n, p - 1, gamma = gamma))


  # Coefficients at the chosen lambda, intercept dropped.
  co   <- as.numeric(coef(fit, s = fit$lambda[k]))[-1]
  B[i, -i] <- co
}

# Symmetrize the nodewise coefficients (average with the transpose) and
# zero the diagonal; W is the final undirected edge-weight matrix.
W <- (B + t(B)) / 2
diag(W) <- 0

# Edge list of all non-zero weights, one row per ordered (from, to) pair...
nz <- which(W != 0, arr.ind = TRUE)
edges_df <- data.frame(
  from = colnames(W)[nz[, 1]],
  to   = colnames(W)[nz[, 2]],
  coef = W[nz],
  stringsAsFactors = FALSE
)

# ...then keep each undirected edge once (from < to alphabetically) and
# sort by decreasing absolute weight.
edges_df <- edges_df[edges_df$from < edges_df$to, , drop = FALSE]
edges_df <- edges_df[order(-abs(edges_df$coef)), , drop = FALSE]

# Recorded output of this run — Taxa: 16; Non-zero edges: 42

library(igraph)
library(stringr)

# Linearly map `x` onto the interval `to`. Degenerate inputs — empty,
# all-NA, or constant vectors, where the observed range collapses —
# map every element to the midpoint of `to` instead of dividing by zero.
rescale01 <- function(x, to = c(0.6, 3.8)) {
  lo <- min(x, na.rm = TRUE)
  hi <- max(x, na.rm = TRUE)
  if (!is.finite(lo) || !is.finite(hi) || hi - lo == 0) {
    return(rep(mean(to), length(x)))
  }
  to[1] + (x - lo) / (hi - lo) * (to[2] - to[1])
}

# Sanity checks on the symmetrised weight matrix before graph building.
stopifnot(exists("W"), is.matrix(W), !is.null(colnames(W)), nrow(W) == ncol(W))

# Unweighted adjacency (1 = surviving non-zero coefficient), turned into
# an undirected graph with self-loops dropped.
adj <- matrix(as.numeric(W != 0), nrow = nrow(W), dimnames = dimnames(W))
g   <- graph_from_adjacency_matrix(adj, mode = "undirected", diag = FALSE)


# Look up the signed weight in `W` for every edge of `g`, in igraph's
# edge order.
#
# Uses two-column matrix indexing (`W[el]`), which is vectorised and —
# unlike apply() over edgelist rows — correctly returns numeric(0) for
# an edgeless graph instead of an empty list.
#
# @param W square numeric matrix of edge weights.
# @param g igraph object whose vertex indices match rows/cols of W.
# @return numeric vector, one weight per edge of `g`.
get_w <- function(W, g) {
  el <- as_edgelist(g, names = FALSE)
  W[el]
}
# Edge aesthetics, indexed in igraph's edge order:
# width encodes |coefficient|, colour encodes its sign.
w        <- get_w(W, g)
e_width  <- rescale01(abs(w), to = c(0.6, 4.0))      # higher correlation if wider
e_color  <- ifelse(w >= 0, "steelblue", "firebrick")  # positive: blue, negative: red
e_curve  <- 0.15


# One distinct hue per node. NOTE(review): rainbow() is deterministic,
# so this seed has no visible effect here — presumably left over from an
# earlier stochastic layout; verify before removing.
set.seed(1)
node_cols <- grDevices::rainbow(vcount(g), s = 0.9, v = 0.9)

# Fixed circular layout; `theta` is each node's angle around the circle,
# used below to rotate its label so it points radially outward.
coords <- layout_in_circle(g)
theta  <- atan2(coords[,2], coords[,1])

# Compact node label from a full clade string: prefer the species epithet
# (token after ".s__"), fall back to the genus (after ".g__"), and as a
# last resort use the final "_"-separated token of the whole name.
shorten_taxon <- function(x) {
  sp <- str_extract(x, "(?<=\\.s__)[^\\._]+")
  if (!is.na(sp) && nzchar(sp)) return(sp)
  ge <- str_extract(x, "(?<=\\.g__)[^\\._]+")
  if (!is.na(ge) && nzchar(ge)) return(ge)
  tail(strsplit(x, "_", fixed = TRUE)[[1]], 1)
}

# vapply() pins the result to a character vector (sapply() would silently
# change type on empty or ragged input); names are taken from V(g)$name,
# matching sapply()'s USE.NAMES behaviour.
short_names <- vapply(V(g)$name, shorten_taxon, character(1))

# Place labels slightly outside the unit circle.
lab_r  <- 1.12
lab_xy <- cbind(coords[,1]*lab_r, coords[,2]*lab_r)  # NOTE(review): unused — lab_xy_scaled below is recomputed from coords; candidate for removal

# Rotate each label to its node's angle; labels in the (90, 270) window
# are flipped 180 degrees and right-justified so they are not upside down.
# NOTE(review): atan2 yields angles in (-180, 180], so the (90, 270)
# window only flips the upper-left quadrant — lower-left labels
# (angle < -90) keep adj = 0; verify they render as intended.
ang_deg_raw <- theta * 180 / pi
flip        <- (ang_deg_raw > 90 & ang_deg_raw < 270)
ang_deg     <- ifelse(flip, ang_deg_raw - 180, ang_deg_raw)
hjust       <- ifelse(flip, 1, 0)

# Shared layout/label knobs for the network plots below.
enlarge   <- 1.30     # blow the circle up inside the fixed plot window
lab_push  <- 1.08     # push labels slightly further out than the nodes
label_cex <- 0.55     # small label text to limit overlap

layout_scaled <- coords * enlarge
lab_xy_scaled <- cbind(coords[,1]*enlarge*lab_push, coords[,2]*enlarge*lab_push)
lim <- 1.02 * enlarge * lab_push  # axis limit with a 2% margin around the labels

# Wider top margin for the title; xpd = NA allows drawing outside the
# plot region so the pushed-out labels are not clipped.
op <- par(mar = c(3,3,5,3), xpd = NA, xaxs = "i", yaxs = "i")
# Full network across all weeks: every non-zero edge is drawn (no
# |coef| threshold). NOTE(review): the layout is precomputed, so this
# seed presumably has no effect on plot() here — confirm before removing.
set.seed(123)
plot(
  g,
  layout = layout_scaled,
  vertex.size = 4.2,
  vertex.label = NA,                      # labels drawn manually below, rotated
  vertex.color = node_cols,
  vertex.frame.color = NA,
  edge.width = e_width,
  edge.color = e_color,
  edge.curved = e_curve,
  rescale = FALSE,                        # keep our manual coordinates (no auto rescale to [-1, 1])
  xlim = c(-lim, lim), ylim = c(-lim, lim),
  asp = 1                                 # 1:1 aspect so the circle stays round
)
title(main = "Taxa Co-occurrence across all weeks with |coef| ≥ 0", line = 4)

# text() applies a single srt/adj per call, so each rotated label must be
# drawn with its own call rather than vectorised.
for (i in seq_len(vcount(g))) {
  text(
    x = lab_xy_scaled[i, 1],
    y = lab_xy_scaled[i, 2],
    labels = short_names[i],
    srt = ang_deg[i],
    adj = hjust[i],
    cex = label_cex,                    
    col = "black"
  )
}

# Restore the graphical parameters changed for this plot.
par(op)

# Draw a taxa co-occurrence network on the shared circular layout.
#
# Relies on globals prepared above: `coords`, `node_cols`, `e_curve`,
# `short_names`, `ang_deg`, `hjust`, and `rescale01()`. The vertex set
# and order of `g_obj` must match `coords`.
#
# @param g_obj     igraph object to draw.
# @param w_vec     signed edge weights, in `g_obj`'s edge order.
# @param title_txt main title ("" lets the caller add its own title).
# @param enlarge   scale factor applied to the circular layout.
# @param lab_push  extra radial push of labels relative to the nodes.
# @param label_cex label text size.
# @param top_mar   top margin (in lines) reserved for the title.
# @return invisibly NULL; called for its plotting side effect.
plot_circle_net <- function(g_obj, w_vec, title_txt,
                            enlarge = 1.30, lab_push = 1.08,
                            label_cex = 0.55, top_mar = 5) {

  # Width encodes |coefficient|, colour encodes its sign.
  e_width <- rescale01(abs(w_vec), to = c(0.6, 4.0))
  e_color <- ifelse(w_vec >= 0, "steelblue", "firebrick")

  layout_scaled <- coords * enlarge
  lab_xy_scaled <- cbind(coords[, 1] * enlarge * lab_push,
                         coords[, 2] * enlarge * lab_push)
  lim <- 1.02 * enlarge * lab_push

  # Restore graphics state even if plotting fails part-way through.
  op <- par(mar = c(3, 3, top_mar, 3), xpd = NA, xaxs = "i", yaxs = "i")
  on.exit(par(op), add = TRUE)

  plot(
    g_obj,
    layout = layout_scaled,
    vertex.size = 4.2,
    vertex.label = NA,
    vertex.color = node_cols,
    vertex.frame.color = NA,
    edge.width = e_width,
    edge.color = e_color,
    edge.curved = e_curve,
    rescale = FALSE,  # keep the manually enlarged coordinates (no auto rescale)
    xlim = c(-lim, lim), ylim = c(-lim, lim),
    asp = 1
  )
  title(main = title_txt, line = 2)

  # text() applies one srt/adj per call, so draw each rotated label separately.
  for (i in seq_len(vcount(g_obj))) {
    text(lab_xy_scaled[i, 1], lab_xy_scaled[i, 2], short_names[i],
         srt = ang_deg[i], adj = hjust[i],
         cex = label_cex, col = "black")
  }
  invisible(NULL)
}


# Thresholded view: keep only edges with |coefficient| >= tau, but keep
# every vertex (delete.vertices = FALSE) so the circular layout, node
# colours and label geometry stay aligned with the full graph.
tau   <- 0.1
keep_e <- which(abs(w) >= tau)
g_tau  <- subgraph.edges(g, eids = keep_e, delete.vertices = FALSE)
w_tau  <- get_w(W, g_tau)

# title_txt is "" here: the heading is added afterwards so tau can be
# formatted into it with sprintf.
plot_circle_net(
  g_obj     = g_tau,
  w_vec     = w_tau,
  title_txt = "", 
  enlarge   = 1.30,
  lab_push  = 1.08,
  label_cex = 0.55,
  top_mar   = 6
)
title(sprintf("Taxa Co-occurrence across all weeks with |coef| ≥ %.2f", tau), line = 2.5)